library(data.table)
library(reshape2)
library(dplyr)
library(tidyverse)
library(ComplexHeatmap)
library(ggplot2)
library(circlize)
library(RColorBrewer)
library(viridis)
library(colorspace)
#courtesy from @Rich Scriven https://stackoverflow.com/questions/26045478/source-r-scripts-in-different-folders
sourceFolder <- function(folder, recursive = FALSE, ...)
{
files <- list.files(folder, pattern = "[.][rR]$",
full.names = TRUE, recursive = recursive)
if (!length(files))
stop(simpleError(sprintf('No R files in folder "%s"', folder)))
src <- invisible(lapply(files, source, ...))
message(sprintf('%s files sourced from folder "%s"', length(src), folder))
}
sourceFolder(
"/mnt/sda/alberto/projects/dev/comparABle/code/functions/", # change for definitive path once its final
recursive = TRUE
)
# source("/mnt/sda/alberto/projects/dev/minitools_R/code/functions/")
source('./code/R_functions/TF_analysis_functions/barplotcluster.R') # change for definitive path once its final
We start with our previous session.
load(
"./outputs/rda/01_plei_dataload.rda"
)
We will also load the TF annotation of Pristina leidyi.
plei_tfs <- read.table(
"./data/03_TFdata/20221025_pristina_TFs_curated.tsv",
header = T)[,1:2] #load column of plei ID and TF class
head(plei_tfs)
## id class
## 1 PrileiEVm006188t1 A_20
## 2 PrileiEVm016861t1 A_20
## 3 PrileiEVm018456t1 A_20
## 4 PrileiEVm018745t1 A_20
## 5 PrileiEVm007500t1 AP_2
## 6 PrileiEVm000374t1 ARID
We subset the gene expression pseudo-bulk matrix to retrieve expression from the TFs.
plei_tfs_cpm <-
plei_cpm[
rownames(plei_cpm) %in% plei_tfs$id,
1:49
]
colnames(plei_tfs_cpm) <- colnames(plei_cpm)[1:49]
plei_tfs_cpm <- as.data.frame(plei_tfs_cpm)
plei_tfs_cpm_hvg <-
plei_tfs_cpm[
rownames(plei_tfs_cpm) %in% plei_hvg$id,
]
We can browse the expression level of different TFs using this function.
plot_tf_plei <- function(x){
if(x %in% rownames(plei_tfs_cpm)) {
barplot(
height=unlist(c(
plei_tfs_cpm[
grep(
paste("^",x,"$",sep=""),
rownames(plei_tfs_cpm),
),
1:49
]
)),
col = plei_ctypes_col$col,
border = "#2F2F2F",
las=2,
cex.names=0.7,
main= paste(
x,
" (",
plei_tfs[grep(x,plei_tfs$id),2],
")\n",
sep=""
),
ylab="counts per million per cluster"
)} else {
stop("Name not in list of TFs.")
}
}
For example, the hnf4 gene;
hnf4 <- "PrileiEVm006891t1"
plot_tf_plei(hnf4)
# -gut,hnf4 : 006891, 004665, 008019
# -muscle: myoD: 008071, 014648
# -neurons: myt1l: 000431, pou6f :003917
# -UMAP feature of novel cell types:
# -lipoxygenase+ 001926
# -globin+ 002870 005469
# -eleocytes 005230 – ets4
# -vigilin+ 005896, 006907
# -polycystin 010161
As we have expression data of many transcription factors, we can visualise the global patterns of expression using heatmaps. We will do so by scaling the log-transformed expression of TFs to obtain a z-score.
plei_tfs_zsco <-
t(
scale(
t(
log(plei_tfs_cpm[,1:49]+1) # we add 1 for visualisation only
)
)
)
And using the ComplexHeatmap package:
We can explore the variability of TFs based at a TF class level.
# boxplot CV per TF class
plei_tfs_class_cv <- merge(
data.frame(
id = plei_tfs$id,
class = factor(
plei_tfs$class,
levels = unique(plei_tfs$class)
)
),
data.frame(
id = rownames(plei_tfs_cpm),
cv = apply(
plei_tfs_cpm[,1:49],
1,
function(x){
sd(x)/mean(x)
}
)
),
by = 1
)[,2:3]
head(plei_tfs_class_cv)
## class cv
## 1 zf_C2H2 0.5658257
## 2 zf_C2H2 1.5710520
## 3 DDT 0.7542196
## 4 MYB 0.7197607
## 5 bHLH 0.4712681
## 6 bHLH 1.3131212
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.
With this we observe that there are a number of TF classes with very few assigned genes in Pristina.
We can focus on those with a minimum of four or five genes.
plei_tfs_mainclasses <-
names(
table(plei_tfs$class)[
table(plei_tfs$class) >= 4
]
)
plei_tfs_mainclasses <- sort(plei_tfs_mainclasses) # sort alphabetically
plei_tfs_mainclasses
## [1] "A_20" "bHLH" "bZIP" "CP2" "CSD"
## [6] "CUT" "DM" "E2F" "ETS" "Forkhead"
## [11] "HMG" "Homeodomain" "HTH" "IRF" "MH1"
## [16] "MYB" "P53" "PAX" "Pou" "RFX"
## [21] "RHD" "Runt" "RXR_like" "SAND" "SRF"
## [26] "T_box" "TFII" "THAP" "THR_like" "Tub"
## [31] "ZBTB" "zf_BED" "zf_C2H2" "zf_C4" "zf_GATA"
To find differences in TF class composition between cell clusters, we can resort to multivariate analysis two-way ANOVA to detect differences of TF expression between cell clusters, TF class, and the interaction of the two.
We will aggregate expression counts at the broad cell type level for this analysis.
broadtypes_sort <- list( # using here the one from the excel
piwi = c(1,2,3),
epidermis = c(4,5,6,7,8,9,10,11),
gut = c(12,13,14,15,16,17,18,19,20),
muscle = c(21,22,23,24),
neuron = c(25,26,27,28,29,30,31),
blood = c(32,33),
polycystin = c(34,35),
eleo = c(36,37,38),
chaetal = c(39,40),
lipox = 41,
vigil = 42,
lumbrok = 43,
carb = 44,
secretory = c(45,46),
arg = 47,
ldlrr = 48,
metaneph = 49
)
plei_cpm_broad <- sapply(
X = broadtypes_sort,
function(x) {if(length(x) > 1){rowSums(plei_cpm[,x])} else {plei_cpm[,x]}} #maybe should be done with counts_norm?
)
We generate our dataset for ANOVA analysis
anova_data <- merge(
plei_cpm_broad,
plei_tfs,
by.x = 0, by.y = 1)[,c(19,2:18)]
anova_data <- anova_data[anova_data$class %in% plei_tfs_mainclasses,]
anova_data <- reshape2::melt(anova_data)
## Using class as id variables
colnames(anova_data) <- c("class","ctype","counts")
anova_data$class <- factor(anova_data$class, levels = sort(unique(anova_data$class)))
anova_data$ctype <-
factor(
anova_data$ctype,
levels = colnames(plei_cpm_broad)
)
anova_groupby <- group_by(anova_data, class, ctype) %>%
summarise(
count = n(),
mean = mean(counts, na.rm = TRUE),
sd = sd(counts, na.rm = TRUE)
)
## `summarise()` has grouped output by 'class'. You can override using the
## `.groups` argument.
A quick check of the looks of our dataset:
summary(anova_data)
## class ctype counts
## zf_C2H2 :2754 piwi : 615 Min. : 0.000
## Homeodomain:1224 epidermis: 615 1st Qu.: 2.504
## bHLH :1105 gut : 615 Median : 23.605
## ZBTB :1003 muscle : 615 Mean : 88.771
## zf_C4 : 612 neuron : 615 3rd Qu.: 74.990
## Forkhead : 442 blood : 615 Max. :10846.604
## (Other) :3315 (Other) :6765
summary(anova_groupby)
## class ctype count mean
## A_20 : 17 piwi : 35 Min. : 2.00 Min. : 0.00
## bHLH : 17 epidermis: 35 1st Qu.: 4.00 1st Qu.: 18.03
## bZIP : 17 gut : 35 Median : 6.00 Median : 43.22
## CP2 : 17 muscle : 35 Mean : 17.57 Mean : 101.78
## CSD : 17 neuron : 35 3rd Qu.: 16.00 3rd Qu.: 110.30
## CUT : 17 blood : 35 Max. :162.00 Max. :1509.40
## (Other):493 (Other) :385
## sd
## Min. : 0.00
## 1st Qu.: 24.09
## Median : 48.76
## Mean : 130.81
## 3rd Qu.: 124.32
## Max. :2623.02
##
The anova analysis here
# Two-way ANOVA proper
res.aov2 <- aov(counts ~ ctype + class + ctype:class, data = anova_data)
summary(res.aov2)
## Df Sum Sq Mean Sq F value Pr(>F)
## ctype 16 71570079 4473130 63.123 <0.0000000000000002 ***
## class 34 44051561 1295634 18.284 <0.0000000000000002 ***
## ctype:class 544 67564552 124200 1.753 <0.0000000000000002 ***
## Residuals 9860 698712989 70863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As we see there is significant interactions between TF class and cell cluster.
And an two-way interaction plot of the means of counts by class AND cell cluster.
library(effects)
## Loading required package: carData
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
interaction.plot(
x.factor = anova_data$ctype, trace.factor = anova_data$class,
response = anova_data$counts, fun = mean,
type = "b", legend = TRUE,
pch=c(1,19),
col = rainbow(35)
)
plot(
allEffects(res.aov2),
multiline=TRUE
)
Tukey test to compare the means of TF counts by class, by cell cluster
# Tukey comparison of means
anova_comparisons_interaction <- TukeyHSD(res.aov2, which = "ctype:class")
anova_comparisons_interaction$`ctype:class` <-
anova_comparisons_interaction$`ctype:class`[
anova_comparisons_interaction$`ctype:class`[,4] < 0.05 ,
]
From the interaction test, we extract the significant results for significant differences between the same TF class across different cell clusters.
# put this all together with dplyr ...
# also think this better and implement a FUNCTION
anova_comparisons_interaction_DF <- data.frame(
comparison = rownames(anova_comparisons_interaction$`ctype:class`),
as.data.frame(
anova_comparisons_interaction$`ctype:class`
)
)
anova_comparisons_interaction_DF <- data.frame(
anova_comparisons_interaction_DF,
A = sub("-..*", "", anova_comparisons_interaction_DF$comparison),
B = sub("..*-", "", anova_comparisons_interaction_DF$comparison)
)
anova_comparisons_interaction_DF$class_A <-
sub(".*:","",anova_comparisons_interaction_DF$A)
anova_comparisons_interaction_DF$class_B <-
sub(".*:","",anova_comparisons_interaction_DF$B)
anova_comparisons_interaction_DF <-
anova_comparisons_interaction_DF[
anova_comparisons_interaction_DF$class_A ==
anova_comparisons_interaction_DF$class_B ,
]
table(anova_comparisons_interaction_DF$class_A) # or class_B, its the same anyway
##
## bHLH bZIP CSD ETS HMG Homeodomain
## 10 39 46 29 14 12
## MYB zf_C2H2 zf_C4
## 5 17 1
plei_tfs_signif_class <- names(table(anova_comparisons_interaction_DF$class_A))
A quick visualisation of what are the TF classes that explain the most differences between cell clusters:
barplot(
sort(table(anova_comparisons_interaction_DF$class_A)), # or class_B, its the same anyway
col = rev(brewer.pal(9,"Spectral")),
las = 2
)
And we see that all these classes are already in the set of TF classes we will be exploring
plei_tfs_signif_class %in% plei_tfs_mainclasses
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
To visualise these differences in TF abundance at a TF class level, we retrieve the expression matrix at the class level. Below is a quick look too at the table (last columns, including class)
plei_tfs_cpm_topclass <-
merge(
plei_tfs_cpm,
plei_tfs,
by.x = 0,
by.y = 1,
) %>%
column_to_rownames("Row.names") %>%
filter(class %in% plei_tfs_mainclasses)
head(plei_tfs_cpm_topclass[,45:50])
## 34_secretory_1 44_secretory_2 32_arginase_pos_cells
## PrileiEVm000032t1 13.97038 39.97282 96.8828
## PrileiEVm000093t1 0.00000 0.00000 0.0000
## PrileiEVm000118t1 83.82230 39.97282 24.2207
## PrileiEVm000154t1 83.82230 79.94564 121.1035
## PrileiEVm000159t1 13.97038 0.00000 72.6621
## PrileiEVm000175t1 0.00000 79.94564 96.8828
## 35_ldlrr_pos_cells 37_metanephridia class
## PrileiEVm000032t1 33.62927 82.42664 zf_C2H2
## PrileiEVm000093t1 33.62927 0.00000 zf_C2H2
## PrileiEVm000118t1 168.14635 164.85328 MYB
## PrileiEVm000154t1 168.14635 274.75547 bHLH
## PrileiEVm000159t1 33.62927 0.00000 bHLH
## PrileiEVm000175t1 67.25854 0.00000 CSD
We will generate a matrix to count how many genes of each TF class are expressed in each cell cluster.
plei_tfs_ngenes <-
apply(
plei_tfs_cpm_topclass[1:49],
2,
function(x){ ifelse(x > 0, 1, 0) }
)
row.names(plei_tfs_ngenes) <- plei_tfs_cpm_topclass$class
plei_tfs_ngenes <-
aggregate(
plei_tfs_ngenes[, 1:49],
by = list(Category = rownames(plei_tfs_ngenes)),
FUN = sum) %>%
arrange(Category) %>%
column_to_rownames("Category")
We also generate a table of all the cpms per class and per cell cluster. We will aggregate (sum) all cpms by TF class, for each cell cluster separately. (We will also generate a normalised )
plei_tfs_expgenes <-
aggregate(
plei_tfs_cpm_topclass[, 1:49],
by = list(Category = plei_tfs_cpm_topclass$class),
FUN = sum) %>%
arrange(Category) %>%
column_to_rownames("Category")
We divide the cpms/class by the numexp/class, thus retrieving the cpms per class normalised by the number of expressed genes from a given class.
plei_tfs_expngenes <-
plei_tfs_expgenes / plei_tfs_ngenes
plei_tfs_expngenes <- apply(plei_tfs_expngenes, 2, function(x){ifelse(is.nan(x),0,x)})
plei_tf_EXPNGEN <-
apply(plei_tfs_expngenes, 2, function(x) {
x / sum(x)
})
A barplot to show the prominence/prevalence of TF classes on each cell cluster.
We will save the important bits for further analysis in the rest of markdowns.
save(
# cell type table(s)
plei_counts,
plei_ctypes,
plei_ctypes_col,
# gene expression data
plei_cpm,
plei_tfs_cpm,
plei_tfs_cpm_topclass,
# tf data
plei_tfs,
plei_tfs_mainclasses,
plei_tfs_signif_class,
# anova analysis
res.aov2,
anova_comparisons_interaction,
anova_comparisons_interaction_DF,
# visual annotations
ctypes_rowAnno,
clu_ha,
#pick color palette for TFs
# destination
file = paste0(
"./outputs/rda/",
"02_tf_analysis.rda"
)
)